{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TextLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content=\"## 创建表\\n\\n```sql\\n# 分区表\\ncreate table test_t2(words string,frequency string) partitioned by (partdate string) row format delimited fields terminated by ',';\\n\\n# orc表\\nCREATE TABLE IF NOT EXISTS bank.account_orc (\\n  `id_card` int,\\n  `tran_time` string,\\n  `name` string,\\n  `cash` int\\n  )\\nstored as orc;\\n```\\n\\n# 插入数据\\n\\n```sql\\ninsert into tablename values('col1', 'col2');\\n\\n\\nINSERT INTO table_name (column1, column2, column3)\\nVALUES\\n(value1, value2, value3),\\n(value4, value5, value6),\\n(value7, value8, value9);\\n\\n\\nINSERT OVERWRITE TABLE tb\\nselect * from tb2\\n;\\n```\", metadata={'source': './examples/sql.md'})]"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain_community.document_loaders import TextLoader\n",
    "\n",
    "loader = TextLoader(\"./examples/sql.md\")\n",
    "loader.load()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CSVLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content='id: 1\\nname: 张三\\ndegree: 本科', metadata={'source': './examples/test.csv', 'row': 0}),\n",
       " Document(page_content='id: 2\\nname: 李四\\ndegree: 硕士', metadata={'source': './examples/test.csv', 'row': 1})]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain_community.document_loaders.csv_loader import CSVLoader\n",
    "\n",
    "\n",
    "loader = CSVLoader(file_path='./examples/test.csv')\n",
    "loader.load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content='id: 1\\nname: 张三\\ndegree: 本科', metadata={'source': '1', 'row': 0}),\n",
       " Document(page_content='id: 2\\nname: 李四\\ndegree: 硕士', metadata={'source': '2', 'row': 1})]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loader = CSVLoader(file_path='./examples/no_fields_name.csv', csv_args={\n",
    "    'delimiter': ',',\n",
    "    'quotechar': '\"',\n",
    "    'fieldnames': ['id', 'name', 'degree']\n",
    "    }, \n",
    "    source_column='id'\n",
    ")\n",
    "\n",
    "loader.load()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# PyPDFLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Previous trailer can not be read (\"invalid literal for int() with base 10: b'/Root'\",)\n",
      "Object 14 0 found\n",
      "Object 3 0 found\n",
      "Object 2 0 found\n",
      "Object 5 0 found\n",
      "Object 7 0 found\n",
      "Object 21 0 found\n",
      "Object 20 0 found\n",
      "Object 22 0 found\n",
      "Object 8 0 found\n",
      "Object 9 0 found\n",
      "Object 10 0 found\n",
      "Object 30 0 found\n",
      "Object 29 0 found\n",
      "Object 31 0 found\n",
      "Object 12 0 found\n",
      "Object 35 0 found\n",
      "Object 34 0 found\n",
      "Object 4 0 found\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[Document(page_content=\"创建表 \\n插⼊数据 # 分区表create table test_t2(words string,frequency string) partitioned by (partdate string) row format delimited fields terminated by ',';# orc表CREATE TABLE IF NOT EXISTS bank.account_orc ( \\xa0`id_card` int, \\xa0`tran_time` string, \\xa0`name` string, \\xa0`cash` int \\xa0)stored as orc;insert into tablename values('col1', 'col2');INSERT INTO table_name (column1, column2, column3)VALUES(value1, value2, value3),(value4, value5, value6),(value7, value8, value9);INSERT OVERWRITE TABLE tbselect * from tb2;\", metadata={'source': 'examples/sql.pdf', 'page': 0})]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain_community.document_loaders import PyPDFLoader\n",
    "\n",
    "loader = PyPDFLoader(\"examples/sql.pdf\")\n",
    "pages = loader.load()\n",
    "pages"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 自定义文档加载器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import AsyncIterator, Iterator\n",
    "\n",
    "from langchain_core.document_loaders import BaseLoader\n",
    "from langchain_core.documents import Document\n",
    "\n",
    "\n",
    "class CustomDocumentLoader(BaseLoader):\n",
    "    \"\"\"An example document loader that reads a file line by line.\n",
    "\n",
    "    Every non-blank line becomes one Document. Whitespace-only lines are\n",
    "    skipped by both the sync and the async loader, so the two code paths\n",
    "    yield the same documents with the same metadata.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, file_path: str) -> None:\n",
    "        \"\"\"Initialize the loader with a file path.\n",
    "\n",
    "        Args:\n",
    "            file_path: The path to the file to load.\n",
    "        \"\"\"\n",
    "        self.file_path = file_path\n",
    "\n",
    "    def _make_document(self, line: str, line_number: int) -> Document:\n",
    "        \"\"\"Wrap one line of text in a Document tagged with its index and source.\"\"\"\n",
    "        return Document(\n",
    "            page_content=line,\n",
    "            metadata={\"line_number\": line_number, \"source\": self.file_path},\n",
    "        )\n",
    "\n",
    "    def lazy_load(self) -> Iterator[Document]:  # <-- Does not take any arguments\n",
    "        \"\"\"A lazy loader that reads a file line by line.\n",
    "\n",
    "        When you're implementing lazy load methods, you should use a generator\n",
    "        to yield documents one by one.\n",
    "\n",
    "        Yields:\n",
    "            One Document per non-blank line. `line_number` in the metadata is\n",
    "            the 0-based index among the yielded lines (blank lines are not\n",
    "            counted), and `source` is the file path.\n",
    "        \"\"\"\n",
    "        with open(self.file_path, encoding=\"utf-8\") as f:\n",
    "            line_number = 0\n",
    "            for line in f:\n",
    "                if not line.strip():\n",
    "                    continue  # blank line: skip it without consuming an index\n",
    "\n",
    "                yield self._make_document(line, line_number)\n",
    "                line_number += 1\n",
    "\n",
    "    # alazy_load is OPTIONAL.\n",
    "    # If you leave out the implementation, a default implementation which delegates to lazy_load will be used!\n",
    "    async def alazy_load(\n",
    "        self,\n",
    "    ) -> AsyncIterator[Document]:  # <-- Does not take any arguments\n",
    "        \"\"\"An async lazy loader that reads a file line by line.\n",
    "\n",
    "        Applies the same blank-line filtering and numbering as lazy_load.\n",
    "        \"\"\"\n",
    "        # Requires aiofiles\n",
    "        # Install with `pip install aiofiles`\n",
    "        # https://github.com/Tinche/aiofiles\n",
    "        import aiofiles\n",
    "\n",
    "        async with aiofiles.open(self.file_path, encoding=\"utf-8\") as f:\n",
    "            line_number = 0\n",
    "            async for line in f:\n",
    "                if not line.strip():\n",
    "                    continue  # keep in step with lazy_load\n",
    "\n",
    "                yield self._make_document(line, line_number)\n",
    "                line_number += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'langchain_core.documents.base.Document'>  |  page_content='## 创建表\\n' metadata={'line_number': 0, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='```sql\\n' metadata={'line_number': 1, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='# 分区表\\n' metadata={'line_number': 2, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content=\"create table test_t2(words string,frequency string) partitioned by (partdate string) row format delimited fields terminated by ',';\\n\" metadata={'line_number': 3, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='# orc表\\n' metadata={'line_number': 4, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='CREATE TABLE IF NOT EXISTS bank.account_orc (\\n' metadata={'line_number': 5, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='  `id_card` int,\\n' metadata={'line_number': 6, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='  `tran_time` string,\\n' metadata={'line_number': 7, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='  `name` string,\\n' metadata={'line_number': 8, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='  `cash` int\\n' metadata={'line_number': 9, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='  )\\n' metadata={'line_number': 10, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='stored as orc;\\n' metadata={'line_number': 11, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='```\\n' metadata={'line_number': 12, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='# 插入数据\\n' metadata={'line_number': 13, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='```sql\\n' metadata={'line_number': 14, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content=\"insert into tablename values('col1', 'col2');\\n\" metadata={'line_number': 15, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='INSERT INTO table_name (column1, column2, column3)\\n' metadata={'line_number': 16, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='VALUES\\n' metadata={'line_number': 17, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='(value1, value2, value3),\\n' metadata={'line_number': 18, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='(value4, value5, value6),\\n' metadata={'line_number': 19, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='(value7, value8, value9);\\n' metadata={'line_number': 20, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='INSERT OVERWRITE TABLE tb\\n' metadata={'line_number': 21, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='select * from tb2\\n' metadata={'line_number': 22, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content=';\\n' metadata={'line_number': 23, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='```' metadata={'line_number': 24, 'source': './examples/sql.md'}\n"
     ]
    }
   ],
   "source": [
    "loader = CustomDocumentLoader('./examples/sql.md')\n",
    "for doc in loader.lazy_load():\n",
    "    print(type(doc), ' | ', doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'langchain_core.documents.base.Document'>  |  page_content='## 创建表\\n' metadata={'line_number': 0, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='\\n' metadata={'line_number': 1, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='```sql\\n' metadata={'line_number': 2, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='# 分区表\\n' metadata={'line_number': 3, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content=\"create table test_t2(words string,frequency string) partitioned by (partdate string) row format delimited fields terminated by ',';\\n\" metadata={'line_number': 4, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='\\n' metadata={'line_number': 5, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='# orc表\\n' metadata={'line_number': 6, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='CREATE TABLE IF NOT EXISTS bank.account_orc (\\n' metadata={'line_number': 7, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='  `id_card` int,\\n' metadata={'line_number': 8, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='  `tran_time` string,\\n' metadata={'line_number': 9, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='  `name` string,\\n' metadata={'line_number': 10, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='  `cash` int\\n' metadata={'line_number': 11, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='  )\\n' metadata={'line_number': 12, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='stored as orc;\\n' metadata={'line_number': 13, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='```\\n' metadata={'line_number': 14, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='\\n' metadata={'line_number': 15, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='# 插入数据\\n' metadata={'line_number': 16, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='\\n' metadata={'line_number': 17, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='```sql\\n' metadata={'line_number': 18, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content=\"insert into tablename values('col1', 'col2');\\n\" metadata={'line_number': 19, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='\\n' metadata={'line_number': 20, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='\\n' metadata={'line_number': 21, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='INSERT INTO table_name (column1, column2, column3)\\n' metadata={'line_number': 22, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='VALUES\\n' metadata={'line_number': 23, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='(value1, value2, value3),\\n' metadata={'line_number': 24, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='(value4, value5, value6),\\n' metadata={'line_number': 25, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='(value7, value8, value9);\\n' metadata={'line_number': 26, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='\\n' metadata={'line_number': 27, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='\\n' metadata={'line_number': 28, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='INSERT OVERWRITE TABLE tb\\n' metadata={'line_number': 29, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='select * from tb2\\n' metadata={'line_number': 30, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content=';\\n' metadata={'line_number': 31, 'source': './examples/sql.md'}\n",
      "<class 'langchain_core.documents.base.Document'>  |  page_content='```' metadata={'line_number': 32, 'source': './examples/sql.md'}\n"
     ]
    }
   ],
   "source": [
    "async for doc in loader.alazy_load():\n",
    "    print(type(doc), ' | ', doc)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "langchain",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
